PART 2

2.2 Does the degree distribution appear to follow a power law or a Poisson? Explain and comment by showing suitable visual and numerical evidence that supports your reasoning.

# here we import the cum_dev matrices (computed and stored before) for all the graphs.
# Import the cumulative/empirical degree-distribution matrices (computed and
# stored earlier) for all five graphs. Each has columns V1 = degree k,
# V2 = empirical frequency, V3 = cumulative frequency.
# Note: read.csv() already returns a data.frame, so the as.data.frame()
# wrapper used originally was redundant.
D1 <- read.csv("distributions_matrices/cum_dev_matrix_1.csv")
D2 <- read.csv("distributions_matrices/cum_dev_matrix_2.csv")
D3 <- read.csv("distributions_matrices/cum_dev_matrix_3.csv")
D4 <- read.csv("distributions_matrices/cum_dev_matrix_4.csv")
D5 <- read.csv("distributions_matrices/cum_dev_matrix_5.csv")

Below are two plots showing, respectively, the cumulative and the empirical degree distributions (both in log-log scale) for all 5 generated graphs.

# Side-by-side log-log comparison of the five graphs' degree distributions:
# left panel = cumulative (column V3), right panel = empirical (column V2).
# The five per-graph stanzas are collapsed into a loop; colors, line types,
# and legend entries match graph index i (lty = i, as in the original).
dist_list   <- list(D1, D2, D3, D4, D5)
line_colors <- c("red", "blue", "green", "orange", "purple")
graph_labels <- c("Graph 1", "Graph 2", "Graph 3", "Graph 4", "Graph 5")

par(mfrow = c(1, 2))

# (1) cumulative distributions in log-log scale
plot(x = log(D1$V1), y = log(D1$V3), type = "l", col = line_colors[1],
     lty = 1, lwd = 2,
     xlab = "K in log scale", ylab = "cumulative in log scale",
     main = "Cumulative in log-log scale")
for (i in 2:5) {
  lines(x = log(dist_list[[i]]$V1), y = log(dist_list[[i]]$V3),
        col = line_colors[i], type = "l", lwd = 2, lty = i)
}
legend("topright", legend = graph_labels, col = line_colors, lty = 1:5,
       cex = 0.5, bty = 'o', bg = 'lightgrey', title = "Graphs:")

# (2) empirical distributions in log-log scale
plot(x = log(D1$V1), y = log(D1$V2), type = "l", col = line_colors[1],
     lty = 1, lwd = 2,
     xlab = "K in log scale", ylab = "empirical in log scale",
     main = "Empirical in log-log scale")
for (i in 2:5) {
  lines(x = log(dist_list[[i]]$V1), y = log(dist_list[[i]]$V2),
        col = line_colors[i], type = "l", lwd = 2, lty = i)
}
legend("topright", legend = graph_labels, col = line_colors, lty = 1:5,
       cex = 0.5, bty = 'o', bg = 'lightgrey', title = "Graphs:")

As a preliminary analysis we simply check, visually, how close our distributions are to a power law. For that reason we plotted the regression lines on the distributions and observed how close each distribution is to a straight line in log-log scale.

# CODE TO PLOT GRAPHS DISTRIBUTIONS AND LINEAR REGRESSION:
# The five cumulative-distribution plots differ only in colors and title, so
# a single helper builds each one and Map() applies it to every graph.

# Helper: log-log scatter of the cumulative distribution of one graph, plus a
# dashed least-squares fitting line. A straight-line fit in log-log scale is
# the visual signature of a power law.
plot_cumulative_fit <- function(df, idx, point_col, line_col) {
  ggplot(df, aes(x = log(V1), y = log(V3))) +
    geom_point(size = 1, color = point_col) +
    geom_smooth(method = lm, se = FALSE, linetype = "dashed",
                color = line_col) +
    ggtitle(paste("Cumulative & fitting line for Graph", idx)) +
    xlab("K in log scale") + ylab("Cumulative in log scale") +
    theme(
      plot.title = element_text(color = point_col, size = 8,
                                face = "bold.italic"),
      axis.title.x = element_text(color = "black", size = 6, face = "bold"),
      axis.title.y = element_text(color = "black", size = 6, face = "bold")
    )
}

# Per-graph point and fitting-line colors (same palette as the original).
point_colors <- c("#DE8CF0", "#525B56", "#00743F", "#36688D", "#0294A5")
fit_colors   <- c("#BED905", "#BE9063", "#F2A104", "#F49F05", "#C1403D")

cumulative_plots <- Map(plot_cumulative_fit,
                        list(D1, D2, D3, D4, D5), 1:5,
                        point_colors, fit_colors)

# Arrange all five plots in a 2 x 3 grid (last cell stays empty).
do.call(grid.arrange, c(cumulative_plots, list(nrow = 2, ncol = 3)))

WARNING: The following coefficients and the intercepts are shown only to give an idea of the slope of the regression lines, and are not to be taken as a serious statistical test.

graph_names <- c("graph 1", "graph 2", "graph 3", "graph 4", "graph 5")
df_list <- list(D1, D2, D3, D4, D5)

# For each graph, fit a straight line to the cumulative distribution in
# log-log space and store a human-readable "y = a bx" summary string.
# Preallocated as character (the original used rep(0, 5), a numeric vector
# that was silently coerced on first string assignment), and indexed with
# seq_along() instead of a manual counter.
lm_function <- character(length(df_list))
for (i in seq_along(df_list)) {
  df <- df_list[[i]]
  # The last row is dropped because its value is 0 and log(0) = -Inf would
  # break the regression.
  fit <- lm(V3 ~ V1, data = log(df[-nrow(df), ]))
  lm_function[i] <- paste(
    "y =", round(coef(fit)[[1]], 2),
    paste0(round(coef(fit)[[2]], 2), "x")
  )
}



# Render a summary table of the fitted regression lines. Both columns are
# already character vectors, so data.frame() can take them directly; the
# original data.frame(cbind(...)) needlessly went through a character matrix.
kable(data.frame(graph_names, lm_function), caption = "Fitted regression lines")
Fitted regression lines
graph_names lm_function
graph 1 y = 13.8 -1.85x
graph 2 y = 13.76 -1.85x
graph 3 y = 13.96 -1.89x
graph 4 y = 13.83 -1.86x
graph 5 y = 14.02 -1.91x

By digging into the igraph library we found a function called fit_power_law(), which fits the empirical data with a power law distribution and then assesses the goodness of the fit with the Kolmogorov–Smirnov test. This should return a more robust statistical result. The function returns the KS_stat and the p-value. The KS_stat tells us how well the power law distribution fits our data, i.e. a lower value corresponds to a better fit. However, this measure alone is not enough to tell how likely it is that the data is drawn from a power law. For that reason the function also returns the p-value. One has to choose a significance level \(\alpha\) before running the test; commonly used values are 1%, 3%, 5%. Getting a p-value higher than the chosen \(\alpha\) means that the fit is "good", and the higher the value, the better the fit (sometimes we even get 1 when rounding to two decimal places).

P.S. We are both looking forward to learning more about these kinds of tests during the rest of the course. Below is a summary table with the KS statistics and the p-values:

Kolmogorov-Smirnov Goodness of Fit Test
graph_names ks_stat p_value
graph 1 0.0248 0.9994
graph 2 0.0266 0.9984
graph 3 0.0331 0.9696
graph 4 0.029 0.9935
graph 5 0.0415 0.8575

As a comparison, below we report the result of a KS test on a graph generated with \(\gamma\) = 1 (this means that every new link is attached uniformly at random to an already existing node).

# Control experiment: a graph grown with gamma = 1 (purely uniform
# attachment) should NOT follow a power law, so its KS p-value should be low.
graph_gamma <- graph_generator(200000, 1)
cm_em_gamma <- cumulative_empirical(graph_gamma)

# Persist both the edge list and the distribution matrix for reuse; the
# original wrapped the second constant path in glue() for no reason.
write.csv(graph_gamma, "graphs_matrices/graph_gamma.csv", row.names = FALSE)
write.csv(cm_em_gamma, "distributions_matrices/cm_em_gamma.csv",
          row.names = FALSE)
# Re-read the CSV so column types/names match the other D* data frames.
cm_em_gamma <- as.data.frame(read.csv("distributions_matrices/cm_em_gamma.csv"))
print(glue("p-value: {round(fit_power_law(cm_em_gamma[,2], xmin=1)$KS.p,4)}"))
## p-value: 0.0013

It is clear from the following plot, and supported by the p-value from the KS-test, that the generated graph does not follow a power law distribution:

# Cumulative distribution of the gamma = 1 graph, with its least-squares
# fitting line: the clear curvature away from a straight line in log-log
# scale agrees with the low KS p-value reported above.
p_gamma <- ggplot(cm_em_gamma, aes(x = log(V1), y = log(V3))) +
  geom_point(color = "#0294A5", size = 1) +
  geom_smooth(color = "#C1403D", linetype = "dashed",
              method = lm, se = FALSE) +
  xlab("K in log scale") +
  ylab("Cumulative in log scale") +
  ggtitle(expression(paste("Cumulative & fitting line for ",gamma," =1"))) +
  theme(
    axis.title.x = element_text(color = "black", size = 10, face = "bold"),
    axis.title.y = element_text(color = "black", size = 10, face = "bold"),
    plot.title = element_text(color = "#0294A5", size = 25,
                              face = "bold.italic")
  )
p_gamma

The visualization of a huge graph is always a big issue, for that reason it has been decided to plot only the edges of two graphs, one generated using \(\gamma\)=.5 and the other one with \(\gamma\)= 1, both having 100000 nodes/edges.

We have also plotted two graphs, with the same generating processes, but with 1.000.000 edges. To see the details, they required to be saved with a very high resolution that does not fit with the R markdown. For that reason we haven’t included them into the markdown but can be found inside the images folder under the names “power_law.png” and “not_power_law.png”. Of course, in order to see the details it is necessary to zoom deeply into every picture.

As it is possible to see from the figure below, there is a significant difference between the two graphs:

  • graph B: in the graph that follows a power law distribution there is a small number of very high degree nodes, and the edges are concentrated around them.

  • graph A: in the graph on the left this is not the same, in fact it is not possible to distinguish a high concentration of edges around few nodes.

# (1) graph with gamma = 1 (uniform attachment — no power law expected)
g_100000_random <- graph_generator(100000, gamma = 1)
# directed = FALSE only so that no edge arrows are drawn in the visualization
g_graph_100000_random <- graph_from_edgelist(
  cbind(g_100000_random$one, g_100000_random$two), directed = FALSE)

# (2) graph with gamma = .5 (preferential attachment — power law expected)
g_100000 <- graph_generator(100000)
g_graph_100000 <- graph_from_edgelist(
  cbind(g_100000$one, g_100000$two), directed = FALSE)

# Very high resolution is needed to distinguish individual edges when zooming.
png("images/100000_edges_high_resolution.png", width = 16000, height = 13193)

par(mfrow = c(1, 2))

# Graph A (gamma = 1): edges spread roughly evenly across the nodes.
# NOTE(review): the original passed `edge.size`, which is not a plot.igraph
# parameter and therefore had no effect; `edge.width` is the documented name
# for edge thickness.
plot(g_graph_100000_random, vertex.size = 0, edge.width = 0.001,
     vertex.label = NA, vertex.frame.color = NA, vertex.color = NA,
     edge.color = 'black')
title(main = 'Graph A', line = -40, cex.main = 30)

# Graph B (gamma = .5): edges concentrate around a few very high degree hubs.
plot(g_graph_100000, vertex.size = 0, edge.width = 0.001,
     vertex.label = NA, vertex.frame.color = NA, vertex.color = NA,
     edge.color = 'black')
title(main = 'Graph B', line = -40, cex.main = 30)
dev.off()

The figure below has been exported at a high resolution and rendered here in the markdown as a separate image.